# coding=utf-8

import re
import urllib

# Get the title of an ordinary web page
def get_title(urlData, startStr="<title>", endStr="</title>"):
    """Return the text enclosed by ``startStr`` and ``endStr`` in ``urlData``.

    Args:
        urlData: HTML source of the page, as text.
        startStr: Literal opening delimiter (defaults to ``<title>``).
        endStr: Literal closing delimiter (defaults to ``</title>``).

    Returns:
        The text between the first delimiter pair, with tab characters
        removed and leading/trailing newlines stripped, or the string
        ``"Title is None!"`` when no delimiter pair is found.
    """
    # The delimiters are escaped so they match literally, and the body is
    # matched lazily so a later "</title>" on the page (e.g. inside inline
    # SVG) cannot extend the capture. re.S lets the title span several lines.
    pattern = re.compile(
        re.escape(startStr) + "(.*?)" + re.escape(endStr), re.S
    )
    found = pattern.search(urlData)
    if found is None:
        return "Title is None!"
    # Remove tabs and the surrounding newlines left by the page's formatting.
    return found.group(1).replace("\t", "").strip("\n")

# Get the keywords of an ordinary web page
def get_keywords(urlData):
    """Return the ``content`` value of the page's keywords ``<meta>`` tag.

    Args:
        urlData: HTML source of the page, as text.

    Returns:
        The content of the first ``<meta name="keywords" content="...">``
        tag (matched case-insensitively), or the string
        ``"Keywords is None!"`` when the page has no such tag.
    """
    # re.I already covers "keywords"/"Keywords", so a single pattern is
    # enough. The tag may be closed with '" />', '"/>' or '">'.
    pattern = re.compile(
        r'<meta name="keywords" content="(.*?)"\s*/?>', re.I
    )
    found = pattern.search(urlData)
    if found is None:
        return "Keywords is None!"
    return found.group(1)

# Get the description of an ordinary web page
def get_description(urlData):
    """Return the ``content`` value of the page's description ``<meta>`` tag.

    Args:
        urlData: HTML source of the page, as text.

    Returns:
        The content of the first ``<meta name="description" content="...">``
        tag (matched case-insensitively), or the string
        ``"Description is None!"`` when the page has no such tag.
    """
    # re.I already covers "description"/"Description", so a single pattern
    # is enough. The tag may be closed with '" />', '"/>' or '">'.
    pattern = re.compile(
        r'<meta name="description" content="(.*?)"\s*/?>', re.I
    )
    found = pattern.search(urlData)
    if found is None:
        return "Description is None!"
    return found.group(1)

# data = get_keywords(urllib.urlopen("http://www.boc.cn/").read())
# print data
